# To do:
# Clean the data:
# - Remove rows with missing values
# - Transform variables into numbers (yes / no -> 1 / 0)
# Inspect data:
# - Did students answer(?)
# - Choose according hypotheses (do we include families, working people, etc.) - can we even compare students with
# other groups of people?
# Prepare for regression:
# - Check for assumptions
# Run regression
# Check the quality of regression
# Make conclusions
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned survey data (hardcoded local path - adjust per machine)
df = pd.read_csv(r'C:\Users\Lukas\OneDrive\University\Year 3\Marketing\Project\Deliveroo_data_cleaned.csv')
Latitude
Longitude
Male (0 is female)
Age
Are you currently a student? (1 is a yes, 2 is a no)
Do you currently live with other members of your family?
How many people do you live with?
Approximately how far away from the city centre (downtown) do you live? (Km)
How many times a month do you usually shop for groceries?
How many times a month do you forget a particular item and have to return to the supermarket?
How many times a month do you order something online?
How many times a month do you order restaurant food online?
Do you order groceries online? *
If yes to the previous question, how many times a month?
Would you be interested in using this type of delivery service in general, given a fair price? *
What would be the MAXIMUM delivery fee you would be willing to pay for efficiently delivered grocery store items?
# Dropping NaN values that were not made by mistake (because of questionnaire design)
# - truly unanswered: these columns were mandatory, so a NaN means no response
df = df.dropna(subset=['Latitude', 'Longitude', 'Male', 'Age', 'is_student',
                       'live_with_family', 'order_groceries_online', 'fast_groceries_would_use'])
# Fill the remaining NaN values with 0's (because it's a slider question)
# - NaN means the respondent meant 0
df.fillna(value=0, inplace=True)
# Maximum fee column is different, 0 value would not be interpreted as won't use, but like
# won't pay, therefore it is recoded into 'Wont use'
# - 0 is not a valid fee to pay
# df.loc[df['maximum_fee_for_fast_groceries_delivery'] == 0,
#        'maximum_fee_for_fast_groceries_delivery'] = 'Wont use'
# Replace 2 (which is a 'No') with 0 so the yes/no columns become 1/0 dummies
tr = ['Male', 'is_student', 'live_with_family',
      'order_groceries_online', 'fast_groceries_would_use']
for column in tr:
    df.loc[df[column] == 2, column] = 0
# Quick summary statistics as a sanity check of the cleaned data
df.describe()
# Pairwise scatter plots of the continuous variables to eyeball relationships.
# (The original opened an extra empty 15x15 figure first; sns.pairplot always
# creates its own figure, so that one stayed blank - dropped it.)
sns.pairplot(df[['amount_people_living_with',
                 'distance_from_city_center', 'shop_for_groceries_per_month',
                 'forget_an_item_per_month', 'order_something_online_per_month',
                 'order_restaurant_food_online_per_month']])
# Gender split (0 is female per the questionnaire coding)
df['Male'].value_counts()
Nothing to be derived from gender distribution
# Eight most frequent respondent ages
df['Age'].value_counts().head(8)
Old people?
# Student vs non-student split
df['is_student'].value_counts()
Good amount of students and non students
# Frequency tables of the remaining survey variables, one per cell
df['live_with_family'].value_counts()
df['amount_people_living_with'].value_counts()
df['distance_from_city_center'].value_counts()
df['shop_for_groceries_per_month'].value_counts()
df['forget_an_item_per_month'].value_counts()
df['order_something_online_per_month'].value_counts()
df['order_restaurant_food_online_per_month'].value_counts()
df['order_groceries_online'].value_counts()
df['order_groceries_online_per_month'].value_counts()
df['fast_groceries_would_use'].value_counts()
Students are more likely to choose
Dependent variable to be binary
Binary logistic regression requires the dependent variable to be binary and ordinal logistic regression requires the dependent variable to be ordinal. And a value of 1 should represent the desired outcome.
Independent observations
Logistic regression requires the observations to be independent of each other. In other words, the observations should not come from repeated measurements or matched data.
No multicollinearity
Logistic regression requires there to be little or no multicollinearity among the independent variables. This means that the independent variables should not be too highly correlated with each other.
Linearity
Logistic regression assumes linearity of independent variables and log odds. Although this analysis does not require the dependent and independent variables to be related linearly, it requires that the independent variables are linearly related to the log odds.
Large sample size
Logistic regression typically requires a large sample size. A general guideline is that you need a minimum of 10 cases with the least frequent outcome for each independent variable in your model. For example, if you have 5 independent variables and the expected probability of your least frequent outcome is .10, then you would need a minimum sample size of 500 (10*5 / .10).
Meaningful variables
Only the meaningful variables should be included.
Lack of outliers
# Creating temporary df without Latitude and Longitude (coordinates are not
# regression predictors, so exclude them from the multicollinearity checks)
temp = df.drop(columns=['Latitude', 'Longitude'])
def var_corr(varlist, savefig=False, dpi=300):
    '''
    Take a dataframe of independent variables and draw a heatmap of their
    pairwise correlations (lower triangle only).

    Parameters
    ----------
    varlist : pd.DataFrame
        Dataframe whose columns are the variables to correlate.
    savefig : bool, optional
        If True, save the plot as 'Correlation heatmap.png'.
    dpi : int, optional
        Resolution used when saving the figure.
    '''
    # Correlation matrix of the dataframe that was actually passed in
    # (the original ignored the parameter and used the global `temp`).
    corr = varlist.corr()
    # Mask for the heatmap: the upper triangle mirrors the lower one, hide it.
    # `np.bool` was removed in NumPy 1.24 - use the builtin `bool` dtype.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True  # select upper triangle of array
    # Creating the heatmap
    plt.figure(figsize=(10, 10))
    ax = sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cbar_kws={"ticks": [-1, -0.5, 0, 0.5, 1]},  # colorbar tick values
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
        annot=True,
        mask=mask)
    # Tilt X labels so long column names stay readable
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right')
    # Keep Y labels horizontal
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0)
    # Saving the figure
    if savefig:
        plt.savefig('Correlation heatmap.png', dpi=dpi)
var_corr(temp)
def sm_vif(x, savefig=False, dpi=300):
    '''
    Compute the variance inflation factor (VIF) of every column of `x` to
    check for multicollinearity, print the value table and draw a bar chart.

    Parameters
    ----------
    x : pd.DataFrame
        Independent variables (numeric columns only).
    savefig : bool, optional
        If True, save the chart as 'VIF values graph.png'.
    dpi : int, optional
        Resolution used when saving the figure.
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    # Work on a copy so the caller's dataframe is not mutated
    # (the original added the 'Intercept' column to the caller's object).
    x = x.copy()
    # statsmodels' VIF calculation requires an explicit intercept column;
    # it is not added automatically and omitting it gives incorrect VIFs.
    x['Intercept'] = 1
    # Compute VIF per column into a new dataframe
    vif = pd.DataFrame()
    vif["variables"] = x.columns
    vif["VIF"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
    # The intercept's VIF is meaningless - drop it before reporting
    vif = vif[vif.variables != 'Intercept']
    print(vif)
    # Plotting a horizontal bar chart of the VIF values
    plt.figure(figsize=(8, 4))
    b = plt.barh(y=vif['variables'], width=vif['VIF'],
                 height=0.5, color='darkorange')
    plt.title('VIF values', fontsize=20)
    plt.xlabel('Value', fontsize=15)
    plt.ylabel('Variable', fontsize=15)
    # Cleaner frame: hide the top and right spines
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    # The dashed threshold line spans however many variables there are
    yline = vif.variables.count()
    # VIF > 5 is the rule-of-thumb multicollinearity threshold marked here
    plt.plot([5, 5], [-0.4, yline], '--', lw=2, color='darkred')
    # plt.xlim used instead of b.set_xlim because of the barh chart type
    plt.xlim([0, 5.5])
    # Saving the figure
    if savefig:
        plt.savefig('VIF values graph.png', dpi=dpi)
sm_vif(temp)
def find_del_outliers(data, df):
    '''
    Find outliers (more than 3 standard deviations from the mean) in `data`,
    print them, and return them together with the cleaned data.

    Parameters
    ----------
    data : pd.Series
        Column to screen for outliers.
    df : pd.DataFrame
        Dataframe the column belongs to; rows whose `data` value is an
        outlier are dropped from it.

    Returns
    -------
    tuple
        (outliers, cleaned_array, cleaned_df): a list of the outlying
        values, `data` without them, and `df` without the matching rows.
    '''
    # Accepted range: mean +/- 3 standard deviations
    data_std = data.std()
    data_mean = data.mean()
    outlier_cut_off = data_std * 3
    lower_limit = data_mean - outlier_cut_off
    upper_limit = data_mean + outlier_cut_off
    print(f'Upper limit: {upper_limit}')
    print(f'lower limit: {lower_limit}')
    # Collect the values outside the accepted range
    outliers = [value for value in data
                if value > upper_limit or value < lower_limit]
    print(f'Outliers: {outliers}')
    # Keep only the in-range values / rows (same boolean mask for both)
    in_range = (data > lower_limit) & (data < upper_limit)
    cleaned_array = data[in_range]
    cleaned_df = df[in_range]
    return outliers, cleaned_array, cleaned_df
# Screen 'forget_an_item_per_month' for 3-SD outliers and keep the cleaned data
found_outliers, cleaned_array, cleaned_df = find_del_outliers(df['forget_an_item_per_month'], df)
# Box plot outlier visualization
# sns.boxplot(data= df[['gpa', 'rank']]).set_title("GPA and Rank Box Plot") # Example
# Continuous-variable columns to screen for outliers
continuous_columns = ['amount_people_living_with',
                      'distance_from_city_center', 'shop_for_groceries_per_month',
                      'forget_an_item_per_month', 'order_something_online_per_month',
                      'order_restaurant_food_online_per_month',
                      'order_groceries_online_per_month',
                      'maximum_fee_for_fast_groceries_delivery']
def print_outliers(continuous_columns):
    '''
    For each column name in `continuous_columns`, print the column name,
    its 3-SD upper and lower limits, and a list of its outliers.
    (Depends on find_del_outliers and the global df.)
    '''
    # The original looped over `cont_columns`, an undefined name that raised
    # NameError - iterate the parameter that was actually passed in.
    for column in continuous_columns:
        print(column)
        find_del_outliers(df[column], df)
        print('\n')
print_outliers(continuous_columns)
Logistic regression does not require the continuous IV(s) to be linearly related to the DV. It does require the continuous IV(s) be linearly related to the log odds of the DV though. A way to test this is to plot the IV(s) in question and look for an S-shaped curve. Sometimes the S-shape will not be obvious. The plot should have a flat or flat-ish top and bottom with an increasing or decreasing middle.
def linearity_log_odds_vs_indep_vars(df, dep_var_string, string_list_of_ind_vars):
    '''
    Take a dataframe, the dependent variable name and a list of independent
    variable names, and visually examine the linearity between each
    independent variable and the log odds of the dependent variable by
    drawing one logistic regression plot per variable.
    '''
    # One figure per independent variable, keyed by the variable name.
    # (The original also opened an extra unused 10x10 figure and kept a
    # `counter` that was never read - both removed.)
    for variable in string_list_of_ind_vars:
        plt.figure(variable)
        # Keyword x/y arguments: positional data args were deprecated in
        # seaborn 0.12 and removed later.
        sns.regplot(x=variable, y=dep_var_string, data=df, logistic=True).set_title(variable)
# Visually checking if the linearity is present
# (list repeated here because the notebook cells are run independently)
continuous_columns = ['amount_people_living_with',
                      'distance_from_city_center', 'shop_for_groceries_per_month',
                      'forget_an_item_per_month', 'order_something_online_per_month',
                      'order_restaurant_food_online_per_month',
                      'order_groceries_online_per_month',
                      'maximum_fee_for_fast_groceries_delivery']
linearity_log_odds_vs_indep_vars(df, 'fast_groceries_would_use', continuous_columns)
def logistic_reg_sample(df, dependent_var_column_name, independent_var_count):
    '''
    Estimate the minimum sample size for a logistic regression and check
    whether the current sample meets it.

    Rule of thumb used: at least 10 cases of the least frequent outcome per
    independent variable, i.e. min_sample = 10 * k / p, where k is the
    number of independent variables and p the share of the minority outcome.
    Assumes the dependent variable is binary.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the dependent variable column.
    dependent_var_column_name : str
        Name of the binary dependent variable column.
    independent_var_count : int
        Number of independent variables in the planned model.

    Returns
    -------
    float
        The minimal recommended sample size.
    '''
    # Observation counts of the minority and majority outcome categories
    minority_count = df[dependent_var_column_name].value_counts().min()
    majority_count = df[dependent_var_column_name].value_counts().max()
    total_count = minority_count + majority_count
    # Share of the least frequent outcome
    ratio = minority_count / total_count
    # Minimum sample size per the 10-cases-per-variable guideline
    min_sample = (10 * independent_var_count) / ratio
    # Is the current sample large enough?
    if total_count >= min_sample:
        print('Sample size large enough. PASS\n')
    else:
        print('Sample size not large enough. FAIL\n')
    print('The closer the ratio of the dependent variable categories is to 50/50, the less observations are needed.')
    print('The larger the amount of independent variables, the more observations are needed.\n')
    print(f'Minimal recommended sample size is: {int(min_sample)}')
    # Return the value as well - the docstring promised it but the original
    # returned None.
    return min_sample
# 12 candidate independent variables in the planned model
logistic_reg_sample(df, 'fast_groceries_would_use', 12)
# NOTE(review): notebook cell out of order - `logistic_model` is only
# defined in the regression cell below; run that cell first.
# Wald test with an identity restriction matrix (jointly tests that all
# coefficients are zero).
R = np.eye(len(logistic_model.params))
logistic_model.wald_test(R)
print(R)
# Regression variables
# Original predictor list, kept for reference. 'fast_groceries_would_use'
# is the dependent variable and 'maximum_fee_for_fast_groceries_delivery'
# was excluded from the final predictors.
# X = df[['Male', 'Age', 'is_student',
#         'live_with_family', 'amount_people_living_with',
#         'distance_from_city_center', 'shop_for_groceries_per_month',
#         'forget_an_item_per_month', 'order_something_online_per_month',
#         'order_restaurant_food_online_per_month', 'order_groceries_online',
#         'order_groceries_online_per_month', 'fast_groceries_would_use',
#         'maximum_fee_for_fast_groceries_delivery']]
X = df[['Male', 'Age', 'is_student',
        'live_with_family', 'amount_people_living_with',
        'distance_from_city_center', 'shop_for_groceries_per_month',
        'forget_an_item_per_month', 'order_something_online_per_month',
        'order_restaurant_food_online_per_month', 'order_groceries_online',
        'order_groceries_online_per_month']]
Y = df['fast_groceries_would_use']
# Running the regression
X = sm.add_constant(X)  # add an intercept (beta_0) to our model (which is 1.0)
# HC1: heteroskedasticity-robust standard errors
logistic_model = sm.Logit(Y, X).fit(cov_type="HC1")
# Print out the regression output
# print(logistic_model.summary()) # Expanded summary
print(logistic_model.summary2())  # Brief summary
# Unofficial quality check criteria
# Predicted probabilities for each observation
predicted_raw = logistic_model.predict()
# Threshold at 0.5: probability >= 0.5 -> 1 ('Yes'), otherwise 0.
# (The original skipped values equal to exactly 0.5, which would leave
# `predicted` shorter than the dataframe and crash the assignment below.)
predicted = [1 if value >= 0.5 else 0 for value in predicted_raw]
# Assigning predicted values to the data frame
df['predicted'] = predicted
# 'True' where the prediction matches the observed outcome
df['is_true'] = df['fast_groceries_would_use'] == df['predicted']
# Displaying the results
df['is_true'].head(5)
# Counting the amount of correct and incorrect guesses
acc = df[['predicted', 'is_true']].groupby('is_true').count()
print(acc)
# Confusion matrix
cm = logistic_model.pred_table()
print('\nConfusion matrix: \n', cm)
# Calculating the accuracy of the model
# (row 1 of `acc` is the 'True', i.e. correct, count)
accuracy = (acc.iloc[1, 0] / (acc.iloc[0, 0] + acc.iloc[1, 0]))*100
print(f'\nAccuracy: {accuracy:0.1f} %')
Interpreting the log odds is not very straightforward when thinking about its effects. An easier way to interpret the findings is by converting the coefficients of the logistic regression model into odds ratios. This can be done by taking the exponent of the coefficient value.
# GETTING THE ODDS RATIOS, P-VALUES, AND 95% CI
# exp(coefficient) converts a log-odds coefficient into an odds ratio
# (> 1 means the outcome becomes more likely as the variable grows).
model_odds = pd.DataFrame(np.exp(logistic_model.params), columns=['Odds ratio'])
# `logistic_model.pvalues` are p-values, not z-statistics - label the
# column accordingly (the original mislabelled it 'z-value').
model_odds['p-value'] = logistic_model.pvalues
model_odds[['2.5%', '97.5%']] = np.exp(logistic_model.conf_int())
model_odds
# Finding edges of the map (padding the data extent by `zoom` degrees on
# each side to get a more bird-eye-view picture)
zoom = 0.25
lat_min = 50.988007 - zoom  # hardcoded because of an outlier in the data
lat_max = df['Latitude'].max() + zoom
lon_min = df['Longitude'].min() - zoom
lon_max = 6.816895 + zoom  # hardcoded because of an outlier
print(lat_min, lat_max, lon_min, lon_max)
df['Latitude'].nsmallest(5)  # Series with the lowest values (nsmallest / nlargest)
# Removing coordinate outliers
# NOTE(review): `remove_outlier` is not defined anywhere in this file, so
# this line raises NameError as written - presumably find_del_outliers was meant.
df_new = remove_outlier(df, 'Latitude')
def scatter_on_map(lat, lon, savefig=False, dpi=300, figsize=(15, 8),
                   lon_min=0, lon_max=0, lat_min=0, lat_max=0):
    '''
    Draw latitude/longitude points on top of a Blue Marble background map
    and optionally save the result as an image.

    Parameters
    ----------
    lat, lon : array-like
        Point coordinates in degrees.
    savefig : bool, optional
        If True, save the figure as 'Scatter on map.png'.
    dpi : int, optional
        Resolution used when saving.
    figsize : tuple, optional
        Figure size in inches.
    lon_min, lon_max, lat_min, lat_max : float, optional
        Map extent; a lon_min of 0 means no zooming is applied.
    '''
    import os  # Kernel needs to be reset if changes to image.json are made
    import cartopy.crs as ccrs
    import cartopy.feature as cfeature
    from PIL import Image
    plt.figure(figsize=figsize)
    # Plate carree projection (plain lat/lon grid)
    ax = plt.axes(projection=ccrs.PlateCarree())
    # Zoom in when an extent was supplied
    if lon_min != 0:
        ax.set_extent([lon_min, lon_max, lat_min, lat_max])
    # Lift PIL's image-size limit - the background image is very large
    Image.MAX_IMAGE_PIXELS = None
    # Tell cartopy where the background image metadata lives
    os.environ["CARTOPY_USER_BACKGROUNDS"] = r'C:\Users\Lukas\OneDrive\Projects\WB\Articles\Data Analysis\Geographical data\Meteor_data_visualization-master\cartopy_metadata'
    # High-resolution Blue Marble background
    # (see https://visibleearth.nasa.gov/collection/1484/blue-marble for more)
    ax.background_img(name='BM', resolution='high')
    # Show only the map itself - no axes or frame
    plt.axis('off')
    # Overlay the points
    plt.scatter(lon, lat, s=90, alpha=0.6,
                marker='.', color='red', edgecolors='white', linewidths=0.5,
                transform=ccrs.PlateCarree())
    if savefig:
        plt.savefig('Scatter on map.png', bbox_inches='tight', dpi=dpi)
# scatter_on_map(lat, lon, lon_min=lon_min, lon_max=lon_max, lat_min=lat_min, lat_max=lat_max)
# Import the necessary plotting libraries
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work offline (renders inside the notebook)
pyo.init_notebook_mode()
# Plotting the respondent map
def plotly_map_scatter(lon, lat, text, title, market_color='red', geo_scope='europe'):
    '''
    Display an interactive plotly map with country vectors and an overlaid
    scatter plot.

    Parameters
    ----------
    lon, lat : array-like
        Point coordinates in degrees.
    text : array-like
        Hover text for each point.
    title : str
        Figure title.
    market_color : str, optional
        Marker colour (parameter name kept as-is for backward compatibility
        with existing callers; 'marker_color' was presumably intended).
    geo_scope : str, optional
        Plotly geo scope, e.g. 'europe' or 'world'.
    '''
    fig = go.Figure(data=go.Scattergeo(
        lon=lon,
        lat=lat,
        text=text,
        mode='markers',
        # Use the colour parameter - the original hardcoded 'red' here,
        # silently ignoring `market_color`. Default stays 'red'.
        marker_color=market_color,
    ))
    fig.update_layout(
        title=title,
        geo_scope=geo_scope,
        autosize=False,
        width=1000,
        height=1000,
        margin=dict(
            l=50,
            r=50,
            b=100,
            t=100,
            pad=4
        ))
    fig.show()
# Inputs for the respondent map
lon = df['Longitude']
lat = df['Latitude']
# Hover text; 'Male' is a 1/0 dummy - presumably a placeholder, verify intent
text = df['Male']
title = 'Survey respondant location'
geo_scope='europe'
plotly_map_scatter(lon, lat, text, title)